In [45]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
In [46]:
#Create two binomial populations, pop1 = binomial(10, 0.2, 10000) and pop2 = binomial(10, 0.5, 10000)
pop1 = np.random.binomial(10, 0.2, 10000)
pop2 = np.random.binomial(10, 0.5, 10000)
plt.hist(pop1, alpha=0.5, label='Population 1')
plt.hist(pop2, alpha=0.5, label='Population 2')
plt.legend(loc='upper right')
plt.show()
print(pop1.mean())
print(pop2.mean())
print(pop1.std())
print(pop2.std())
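As a quick sanity check (an illustrative addition, not part of the original exercise), the empirical means and standard deviations above can be compared with the theoretical binomial values n*p and sqrt(n*p*(1-p)), assuming the same n = 10 and p values used for pop1 and pop2.
In [ ]:
#Sanity check: theoretical binomial moments, mean = n*p and std = sqrt(n*p*(1-p))
#Assumes n = 10, p1 = 0.2, p2 = 0.5 as in the populations above
n, p1, p2 = 10, 0.2, 0.5
print('pop1 theoretical mean and std:', n * p1, np.sqrt(n * p1 * (1 - p1)))
print('pop2 theoretical mean and std:', n * p2, np.sqrt(n * p2 * (1 - p2)))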
In [47]:
#Create two samples of 100 data points each, one per population
sample1 = np.random.choice(pop1, 100, replace=True)
sample2 = np.random.choice(pop2, 100, replace=True)
plt.hist(sample1, alpha=0.5, label='sample 1')
plt.hist(sample2, alpha=0.5, label='sample 2')
plt.legend(loc='upper right')
plt.show()
In [48]:
#Calculate mean and standard deviations for each sample
print(sample1.mean())
print(sample2.mean())
print(sample1.std())
print(sample2.std())
In [49]:
#Compare samples: calculate t-value & p-value.
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
mean1 = pop1.mean()
mean2 = pop2.mean()
print(ttest_1samp(sample1, mean1))
print(ttest_1samp(sample2, mean2))
print(ttest_ind(sample1, sample2, equal_var=False))
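To make explicit what ttest_1samp reports, the sketch below (an addition for illustration) recomputes the one-sample t statistic for sample1 by hand as t = (sample mean - population mean) / (sample std / sqrt(n)), using the Bessel-corrected standard deviation; it should match the statistic printed above.
In [ ]:
#Recompute the one-sample t statistic for sample1 by hand
#t = (sample mean - population mean) / standard error, with ddof=1
n = len(sample1)
se = sample1.std(ddof=1) / np.sqrt(n)
print((sample1.mean() - mean1) / se)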
In [50]:
#Increase the size of your samples from 100 to 1000
#Calculate the means and standard deviations for your new samples and create histograms for each.
sample3 = np.random.choice(pop1, 1000, replace=True)
sample4 = np.random.choice(pop2, 1000, replace=True)
print(sample3.mean())
print(sample4.mean())
print(sample3.std())
print(sample4.std())
plt.hist(sample3, alpha=0.5, label='sample 3')
plt.hist(sample4, alpha=0.5, label='sample 4')
plt.legend(loc='upper right')
plt.show()
In [51]:
#Compare samples: calculate t-value & p-value.
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
mean1 = pop1.mean()
mean2 = pop2.mean()
print(ttest_1samp(sample3, mean1))
print(ttest_1samp(sample4, mean2))
print(ttest_ind(sample4, sample3, equal_var=False))
In [52]:
#Decrease the size of your samples to 20. What values change, and what remain the same?
sample3 = np.random.choice(pop1, 20, replace=True)
sample4 = np.random.choice(pop2, 20, replace=True)
print(sample3.mean())
print(sample4.mean())
print(sample3.std())
print(sample4.std())
plt.hist(sample3, alpha=0.5, label='sample 3')
plt.hist(sample4, alpha=0.5, label='sample 4')
plt.legend(loc='upper right')
plt.show()
In [53]:
#Compare samples: calculate t-value & p-value.
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
mean1 = pop1.mean()
mean2 = pop2.mean()
print(ttest_1samp(sample3, mean1))
print(ttest_1samp(sample4, mean2))
print(ttest_ind(sample4, sample3, equal_var=False))
What values change, and what remain the same?
When running the one-sample t-test, p is always higher than 0.05, so at the 5% significance level we cannot reject the null hypothesis that each sample comes from a population with the calculated mean; in every case (for both populations) the sample means are consistent with the population means.
Regarding the samples: the means change from draw to draw, and the fewer the data points, the less accurately a sample represents its population.
In the two-sample comparison, t-values increase with the sample size while p-values tend toward zero, showing that the difference in means is due to a real difference between the populations and not just sampling variability.
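The claim above, that t grows and p shrinks as the sample size grows, can be illustrated directly. The sketch below is an illustrative addition: it redraws samples of several sizes from pop1 and pop2 and prints the Welch t-test result for each; exact numbers will vary from run to run because the draws are random.
In [ ]:
#Illustration: the t-value rises and the p-value falls as the sample size grows
#(results vary between runs because the samples are random draws)
for size in [20, 100, 1000]:
    s1 = np.random.choice(pop1, size, replace=True)
    s2 = np.random.choice(pop2, size, replace=True)
    t, p = ttest_ind(s2, s1, equal_var=False)
    print(size, round(t, 2), p)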
In [54]:
#Change the population value p for pop1 to 0.3
pop1 = np.random.binomial(10, 0.3, 10000)
pop2 = np.random.binomial(10, 0.5, 10000)
plt.hist(pop1, alpha=0.5, label='Population 1')
plt.hist(pop2, alpha=0.5, label='Population 2')
plt.legend(loc='upper right')
plt.show()
print(pop1.mean())
print(pop2.mean())
print(pop1.std())
print(pop2.std())
In [55]:
#Samples of the new pop1 and pop2
sample5 = np.random.choice(pop1, 100, replace=True)
sample6 = np.random.choice(pop2, 100, replace=True)
print(sample5.mean())
print(sample6.mean())
print(sample5.std())
print(sample6.std())
plt.hist(sample5, alpha=0.5, label='sample 5')
plt.hist(sample6, alpha=0.5, label='sample 6')
plt.legend(loc='upper right')
plt.show()
In [56]:
#Compare samples: calculate t-value & p-value.
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
mean1 = pop1.mean()
mean2 = pop2.mean()
print(ttest_1samp(sample5, mean1))
print(ttest_1samp(sample6, mean2))
print(ttest_ind(sample6, sample5, equal_var=False))
In [57]:
#Then change the population value p for group 1 to 0.4, and do it again
pop1 = np.random.binomial(10, 0.4, 10000)
pop2 = np.random.binomial(10, 0.5, 10000)
plt.hist(pop1, alpha=0.5, label='Population 1')
plt.hist(pop2, alpha=0.5, label='Population 2')
plt.legend(loc='upper right')
plt.show()
print(pop1.mean())
print(pop2.mean())
print(pop1.std())
print(pop2.std())
In [58]:
#Samples of the new pop1 and pop2
sample5 = np.random.choice(pop1, 100, replace=True)
sample6 = np.random.choice(pop2, 100, replace=True)
print(sample5.mean())
print(sample6.mean())
print(sample5.std())
print(sample6.std())
plt.hist(sample5, alpha=0.5, label='sample 5')
plt.hist(sample6, alpha=0.5, label='sample 6')
plt.legend(loc='upper right')
plt.show()
In [59]:
#Compare samples: calculate t-value & p-value.
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
mean1 = pop1.mean()
mean2 = pop2.mean()
print(ttest_1samp(sample5, mean1))
print(ttest_1samp(sample6, mean2))
print(ttest_ind(sample6, sample5, equal_var=False))
What changes, and why?
The t-value decreases in the second case (when p1 = 0.4 and p2 = 0.5) and the p-value is much higher. The t-value decreases because the difference between the sample means is smaller, and the rise in the p-value shows that sampling variability accounts for a growing share of the observed difference.
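To see why the t-value shrinks as the population means move closer together, Welch's t can be written out from the two sample summaries: t = (mean2 - mean1) / sqrt(s1^2/n1 + s2^2/n2). The sketch below is an illustrative addition that recomputes it for sample5 and sample6; it should match the statistic from ttest_ind above.
In [ ]:
#Recompute Welch's t for sample5 and sample6 by hand
#t = (difference in sample means) / (standard error of the difference), with ddof=1 variances
diff = sample6.mean() - sample5.mean()
se_diff = np.sqrt(sample5.var(ddof=1) / len(sample5) + sample6.var(ddof=1) / len(sample6))
print(diff / se_diff)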
In [69]:
#Change the distribution of your populations from binomial to a distribution of your choice
pop3 = np.random.standard_t(25, 10000)
pop4 = np.random.logistic(9, 2, 10000)
plt.hist(pop3, alpha=0.5, label='Population 3')
plt.hist(pop4, alpha=0.5, label='Population 4')
plt.legend(loc='upper right')
plt.show()
print(pop3.mean())
print(pop4.mean())
print(pop3.std())
print(pop4.std())
In [70]:
#Samples of the new pop3 and pop4.
sample7 = np.random.choice(pop3, 100, replace=True)
sample8 = np.random.choice(pop4, 100, replace=True)
print(sample7.mean())
print(sample8.mean())
print(sample7.std())
print(sample8.std())
plt.hist(sample7, alpha=0.5, label='sample 7')
plt.hist(sample8, alpha=0.5, label='sample 8')
plt.legend(loc='upper right')
plt.show()
In [71]:
#Compare samples: calculate t-value & p-value.
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
mean1 = pop3.mean()
mean2 = pop4.mean()
print(ttest_1samp(sample7, mean1))
print(ttest_1samp(sample8, mean2))
print(ttest_ind(sample8, sample7, equal_var=False))
Do the sample mean values still accurately represent the population values?
In this case, the distance between the sample means is about 21 times the standard error of the difference, and the p-value is close to zero, so the difference we see reflects a genuine difference between the populations rather than sampling variability.
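One direct way to check the question above is to print the sample means and standard deviations next to the population values; the sketch below is an illustrative addition (the exact values depend on the random draw).
In [ ]:
#Compare sample and population summary statistics side by side
print('pop3 mean vs sample7 mean:', pop3.mean(), sample7.mean())
print('pop4 mean vs sample8 mean:', pop4.mean(), sample8.mean())
print('pop3 std vs sample7 std:', pop3.std(), sample7.std())
print('pop4 std vs sample8 std:', pop4.std(), sample8.std())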